## load relevant libraries and functions
## (library() errors on a missing package; require() only returns FALSE,
## which silently masks missing dependencies in a knitted doc)
library(knitr) # for knitting
library(Hmisc) # for descriptives
library(psych)
library(png) # for working with images
library(grid)
library(ggplotify) # for plotting
library(patchwork)
library(scales)
library(lme4) # for mixed effects models
library(ggeffects)
library(broom.mixed)
library(emmeans)
library(generics)
library(tidyverse) # for everything else
## set default code chunk options (TRUE/FALSE, not T/F, which are reassignable)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
## set default plot theme and colors
theme_set(theme_classic(base_size = 18))
## fix print width for knitted doc
options(width = 70)
## suppress warnings about grouping
options(dplyr.summarise.inform = FALSE)
## set random seed for reproducible jitter/bootstrap CIs
set.seed(1)
## set directories for plots and data outputs
figures_dir <- '../figures/'
data_dir <- '../data/'# Import data from https://science-of-learning-datasci.s3.us-west-2.amazonaws.com/mini-project2/coursekata-23-split80.zip
# Import the 80% training split: item-level responses ...
df.responses =
read.csv(paste0(data_dir,
"filtered80_responses_2023.csv"))
# ... and end-of-chapter (EOC) scores for the same students
df.filtered_responses =
read.csv(paste0(data_dir,
"filtered80_eoc_2023.csv"))
# run quick data summary
skimr::skim(df.responses)| Name | df.responses |
| Number of rows | 381091 |
| Number of columns | 49 |
| _______________________ | |
| Column type frequency: | |
| character | 35 |
| logical | 5 |
| numeric | 9 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| institution_id | 0 | 1.00 | 36 | 36 | 0 | 1 | 0 |
| class_id | 0 | 1.00 | 36 | 36 | 0 | 5 | 0 |
| course_name | 0 | 1.00 | 25 | 25 | 0 | 1 | 0 |
| release | 0 | 1.00 | 4 | 9 | 0 | 5 | 0 |
| book | 0 | 1.00 | 43 | 53 | 0 | 2 | 0 |
| student_id | 0 | 1.00 | 36 | 36 | 0 | 560 | 0 |
| item_id | 0 | 1.00 | 5 | 25 | 0 | 484 | 0 |
| item_type | 0 | 1.00 | 4 | 10 | 0 | 2 | 0 |
| chapter | 0 | 1.00 | 26 | 59 | 0 | 9 | 0 |
| page | 0 | 1.00 | 11 | 70 | 0 | 87 | 0 |
| response | 203653 | 0.47 | 0 | 399 | 1 | 2183 | 0 |
| prompt | 0 | 1.00 | 0 | 1667 | 1578 | 647 | 0 |
| dt_submitted | 0 | 1.00 | 21 | 26 | 0 | 290124 | 0 |
| completes_page | 0 | 1.00 | 4 | 5 | 0 | 2 | 0 |
| user_agent | 0 | 1.00 | 0 | 184 | 124985 | 168 | 0 |
| lrn_session_id | 0 | 1.00 | 0 | 36 | 124985 | 166871 | 0 |
| lrn_response_id | 0 | 1.00 | 0 | 69 | 124985 | 256107 | 0 |
| lrn_activity_reference | 0 | 1.00 | 0 | 36 | 124985 | 403 | 0 |
| lrn_question_reference | 0 | 1.00 | 0 | 36 | 124985 | 502 | 0 |
| lrn_type | 0 | 1.00 | 0 | 21 | 124985 | 10 | 0 |
| lrn_dt_started | 0 | 1.00 | 0 | 23 | 124985 | 128867 | 0 |
| lrn_dt_saved | 0 | 1.00 | 0 | 23 | 124985 | 163870 | 0 |
| lrn_status | 0 | 1.00 | 0 | 9 | 124985 | 2 | 0 |
| lrn_response_json | 0 | 1.00 | 0 | 2354 | 124985 | 256107 | 0 |
| lrn_option_0 | 0 | 1.00 | 0 | 191 | 228678 | 227 | 0 |
| lrn_option_1 | 0 | 1.00 | 0 | 186 | 228678 | 230 | 0 |
| lrn_option_2 | 0 | 1.00 | 0 | 188 | 252958 | 201 | 0 |
| lrn_option_3 | 0 | 1.00 | 0 | 188 | 287799 | 147 | 0 |
| lrn_option_4 | 0 | 1.00 | 0 | 142 | 342344 | 50 | 0 |
| lrn_option_5 | 0 | 1.00 | 0 | 150 | 377421 | 7 | 0 |
| lrn_option_6 | 0 | 1.00 | 0 | 19 | 380048 | 3 | 0 |
| lrn_option_7 | 0 | 1.00 | 0 | 19 | 380048 | 3 | 0 |
| dt_submitted_processed | 0 | 1.00 | 20 | 20 | 0 | 282787 | 0 |
| lrn_dt_started_processed | 124985 | 0.67 | 20 | 20 | 0 | 128866 | 0 |
| lrn_dt_saved_processed | 124985 | 0.67 | 20 | 20 | 0 | 163869 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| lrn_option_9 | 381091 | 0 | NaN | : |
| lrn_option_10 | 381091 | 0 | NaN | : |
| lrn_option_11 | 381091 | 0 | NaN | : |
| lrn_dt_started_processed_ms | 381091 | 0 | NaN | : |
| lrn_dt_saved_processed_ms | 381091 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| lms_id | 0 | 1.00 | 5.709242e+04 | 2.723007e+04 | 2.190500e+04 | 3.030300e+04 | 5.193200e+04 | 8.673300e+04 | 1.128510e+05 | ▇▅▁▆▂ |
| points_possible | 90388 | 0.76 | 1.000000e+00 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | ▁▁▇▁▁ |
| points_earned | 90919 | 0.76 | 6.900000e-01 | 4.600000e-01 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | ▃▁▁▁▇ |
| attempt | 0 | 1.00 | 1.280000e+00 | 1.440000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 6.600000e+01 | ▇▁▁▁▁ |
| lrn_question_position | 124985 | 0.67 | 1.670000e+00 | 1.330000e+00 | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 | 2.000000e+00 | 1.000000e+01 | ▇▁▁▁▁ |
| lrn_option_8 | 380579 | 0.00 | 7.100000e+01 | 0.000000e+00 | 7.100000e+01 | 7.100000e+01 | 7.100000e+01 | 7.100000e+01 | 7.100000e+01 | ▁▁▇▁▁ |
| dt_submitted_processed_ms | 0 | 1.00 | 1.679858e+12 | 4.920459e+09 | 1.671239e+12 | 1.674963e+12 | 1.681055e+12 | 1.682923e+12 | 1.691129e+12 | ▆▅▇▂▃ |
| chapter_num | 0 | 1.00 | 5.040000e+00 | 2.400000e+00 | 1.000000e+00 | 3.000000e+00 | 5.000000e+00 | 7.000000e+00 | 9.000000e+00 | ▅▇▂▅▅ |
| page_num | 0 | 1.00 | 5.620000e+00 | 3.030000e+00 | 1.000000e+00 | 3.000000e+00 | 5.000000e+00 | 8.000000e+00 | 1.400000e+01 | ▇▇▆▃▁ |
| Name | df.filtered_responses |
| Number of rows | 4752 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| book | 0 | 1 | 43 | 53 | 0 | 2 | 0 |
| release | 0 | 1 | 4 | 9 | 0 | 5 | 0 |
| institution_id | 0 | 1 | 36 | 36 | 0 | 1 | 0 |
| class_id | 0 | 1 | 36 | 36 | 0 | 5 | 0 |
| student_id | 0 | 1 | 36 | 36 | 0 | 560 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 2375.50 | 1371.93 | 0 | 1187.75 | 2375.50 | 3563.25 | 4751 | ▇▇▇▇▇ |
| chapter | 0 | 1 | 4.95 | 2.59 | 1 | 3.00 | 5.00 | 7.00 | 9 | ▇▇▃▇▇ |
| score | 0 | 1 | 0.73 | 0.20 | 0 | 0.62 | 0.77 | 0.88 | 1 | ▁▁▃▆▇ |
## Reduce response data to one row per student x item (final attempt only)
df.reduced <-
df.responses %>%
# calculate time spent on question (in mins)
mutate(dt_submitted = ymd_hms(dt_submitted),
lrn_dt_started = ymd_hms(lrn_dt_started),
time_spent = as.numeric(difftime(dt_submitted,
lrn_dt_started,
units = "mins")),
# make "completes_page" boolean (source column is the string "true"/"false")
completes_page = completes_page == "true") %>%
group_by(book, release,
institution_id, class_id, student_id,
chapter_num, item_id) %>%
# count only last attempt (with_ties = FALSE guarantees exactly one row)
slice_max(order_by = attempt,
n = 1,
with_ties = FALSE) %>%
ungroup() %>%
# keep only relevant cols (chapter_num renamed to chapter)
select(c(book, release,
institution_id, class_id,
student_id,
chapter = chapter_num, item_id,
time_spent, completes_page,
points_possible, points_earned, attempt))
# sanity check: slice_max above should leave exactly one row per student x item
df.reduced %>%
count(student_id, item_id) # should all be 1## # A tibble: 248,227 × 3
## student_id item_id n
## <chr> <chr> <int>
## 1 001824fb-a2fd-431d-aef6-7a1250d97a62 A4_Code_Shuffling_01 1
## 2 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Empty_01 1
## 3 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Empty_02 1
## 4 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Median_01 1
## 5 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Median_02 1
## 6 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Median_03 1
## 7 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Median_04 1
## 8 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Median_05 1
## 9 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Predictions_01 1
## 10 001824fb-a2fd-431d-aef6-7a1250d97a62 B1_Code_Predictions_02 1
## # ℹ 248,217 more rows
## # A tibble: 10 × 12
## book release institution_id class_id student_id chapter item_id
## <chr> <chr> <chr> <chr> <chr> <int> <chr>
## 1 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 Embedd…
## 2 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-0
## 3 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-1
## 4 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-10
## 5 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-11
## 6 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-12
## 7 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-13
## 8 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-14
## 9 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-15
## 10 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-16
## # ℹ 5 more variables: time_spent <dbl>, completes_page <lgl>,
## # points_possible <int>, points_earned <int>, attempt <int>
## Chapter-level summary: one row per student x chapter with performance
## and engagement metrics plus the previous chapter's score
df.summary <-
df.reduced %>%
# group % correct by book, release, institution, class, student, and chapter
group_by(book, release,
institution_id, class_id, student_id,
chapter) %>%
# calculate % correct across chapter
summarise(score = mean(points_earned, na.rm = TRUE),
# calculate % of pages fully completed across chapter
chapter_pages_completed = mean(completes_page, na.rm = TRUE),
# calculate time spent across chapter
time_spent = mean(time_spent, na.rm = TRUE),
# calculate average number of attempts across chapter
no_attempts = mean(attempt, na.rm = TRUE),
.groups = "drop_last") %>%
# order chapters within student so lag() below picks the preceding chapter
arrange(book, release,
institution_id, class_id, student_id,
chapter) %>%
group_by(book, release, institution_id, class_id, student_id) %>%
# add column for % correct on preceding chapter
mutate(score_prev_chapter = lag(score)) %>% # NA for the first chapter
ungroup()
# sanity check: summarise above should leave exactly one row per student x chapter
df.summary %>%
count(student_id, chapter) # should all be 1## # A tibble: 4,841 × 3
## student_id chapter n
## <chr> <int> <int>
## 1 001824fb-a2fd-431d-aef6-7a1250d97a62 1 1
## 2 001824fb-a2fd-431d-aef6-7a1250d97a62 2 1
## 3 001824fb-a2fd-431d-aef6-7a1250d97a62 3 1
## 4 001824fb-a2fd-431d-aef6-7a1250d97a62 4 1
## 5 001824fb-a2fd-431d-aef6-7a1250d97a62 5 1
## 6 001824fb-a2fd-431d-aef6-7a1250d97a62 6 1
## 7 001824fb-a2fd-431d-aef6-7a1250d97a62 7 1
## 8 001824fb-a2fd-431d-aef6-7a1250d97a62 8 1
## 9 001824fb-a2fd-431d-aef6-7a1250d97a62 9 1
## 10 0096491a-77bf-4e79-b906-fc7a8e5e57a6 1 1
## # ℹ 4,831 more rows
## # A tibble: 10 × 12
## book release institution_id class_id student_id chapter item_id
## <chr> <chr> <chr> <chr> <chr> <int> <chr>
## 1 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 Embedd…
## 2 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-0
## 3 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-1
## 4 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-10
## 5 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-11
## 6 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-12
## 7 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-13
## 8 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-14
## 9 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-15
## 10 College… v5.0 292cff87-3c74… b7489f4… 0096491a-… 1 ch1-16
## # ℹ 5 more variables: time_spent <dbl>, completes_page <lgl>,
## # points_possible <int>, points_earned <int>, attempt <int>
We have a total of 9 chapters, with data from N = 570 students. Not all students answered all questions in all chapters, so we get a range of N = 526 to 562. The proportion of correct responses ranges from 97.8% down to 71.4% and seems to decline with chapter number (i.e., more advanced chapters are harder). The SD ranges from 5% to 17%, increasing with chapter number (i.e., there is more variability in more advanced chapters).
# Print descriptive stats: For each chapter, compute % correct (mean), SD, and SE
df.descriptives =
df.summary %>%
group_by(chapter) %>%
# compute mean % correct per chapter
dplyr::summarise(mean_correct = mean(score,
na.rm = T),
# compute SD
sd_correct = sd(score,
na.rm = T),
# compute number of responses (students contributing to this chapter)
n = n(),
# compute SE (SD / sqrt of n)
se_correct = sd_correct/sqrt(n)) %>%
ungroup()
# print
# NOTE(review): View() opens a viewer only in interactive sessions; for the
# knitted document consider print() or knitr::kable() instead -- confirm intent
View(df.descriptives)Below, we evaluate means and SDs by chapter and book:
# Print descriptive stats: For each chapter *and book*, compute % correct (mean), SD, and SE
df.descriptives2 =
df.summary %>%
group_by(chapter, book) %>%
# compute mean % correct per chapter x book cell
dplyr::summarise(mean_correct = mean(score,
na.rm = T),
# compute SD
sd_correct = sd(score,
na.rm = T),
# compute number of responses (students in this chapter x book cell)
n = n(),
# compute SE (SD / sqrt of n)
se_correct = sd_correct/sqrt(n)) %>%
ungroup()
# print
# NOTE(review): View() is interactive-only; use print()/kable() when knitting
View(df.descriptives2)There are 9 chapters and 2 book versions. The plot below shows that % correct seems to decline with chapter number (i.e., more advanced chapters are harder), but does not vary much by book. We also see increasing variance with increasing chapter number (i.e., there is more variability in more advanced chapters).
## % correct by chapter; individual students colored by book, with chapter
## means overlaid. (The former global aes(color = chapter) was overridden by
## every layer and only polluted the colour scale with chapter levels.)
plot.pct_correct <-
df.summary %>%
mutate(chapter = factor(chapter)) %>%
# chapter on x-axis, % correct on y-axis
ggplot(aes(x = chapter,
y = score * 100)) +
# dot plot for individual participants, colored by book
geom_point(aes(color = book),
position = position_jitter(width = .1,
height = 0),
alpha = .3,
size = 1) +
# line plot connecting dots across chapters (one faint line per student)
geom_line(aes(group = student_id, color = book),
linewidth = .2,
alpha = .03) +
# plot chapter means with bootstrapped 95% CIs
stat_summary(fun.data = mean_cl_boot,
geom = "errorbar",
width = 0.1,
linewidth = 1,
color = "black") +
stat_summary(fun = mean,
geom = "point",
shape = 18,
size = 3,
color = "black") +
# add title and axis labels
labs(title = "% correct by Chapter & Book",
x = "Chapter",
y = "% correct",
color = "Book") +
theme(legend.position = "bottom")
plot.pct_correctWe computed 3 engagement metrics: (1) how many chapter pages were fully completed in a given chapter, (2) how many attempts student took, on average, across questions within a chapter, and (3), how long students took, on average, to answer questions within a chapter.
Not all students answered all questions in all chapters, so we get a range of N = 526 to 562 data points.
The proportion of full pages completed ranged from 12.6% to 27.6% and fluctuated across chapters. The first chapter had the most variance.
The average amount of time spent ranged from 9 to 14.2 minutes. The SD was quite large (12-26 minutes).
The mean number of attempts was consistently close to 1, but ranged from 1.08 to 1.23. It was highest for the first chapter. SDs ranged from .08 to .34.
# Print descriptive stats: For each chapter, compute engagement metrics (mean and SD)
# Bug fix: sd_pages was defined twice in the original summarise(); the second
# definition computed sd(time_spent) and silently overwrote the pages SD.
df.descriptives_engagement <-
df.summary %>%
group_by(chapter) %>%
dplyr::summarise(n = n(),
# mean % of pages fully completed
mean_pages = mean(chapter_pages_completed,
na.rm = TRUE),
# SD of % of pages fully completed
sd_pages = sd(chapter_pages_completed,
na.rm = TRUE),
# mean time spent (mins)
mean_time = mean(time_spent,
na.rm = TRUE),
# SD of time spent
sd_time = sd(time_spent,
na.rm = TRUE),
# mean number of attempts
mean_attempts = mean(no_attempts,
na.rm = TRUE),
# SD of number of attempts
sd_attempts = sd(no_attempts,
na.rm = TRUE)) %>%
ungroup()
# print
View(df.descriptives_engagement)The figure below visualizes this:
# Percentage of chapter pages fully completed, by chapter
plot.pages <-
ggplot(df.summary,
aes(x = factor(chapter), y = chapter_pages_completed * 100)) +
# bootstrapped 95% CI around the chapter mean
stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
color = "black", linewidth = 1, width = 0.1) +
# diamond marker at the chapter mean
stat_summary(fun = mean, geom = "point",
color = "black", shape = 18, size = 3) +
# jittered dots: one per student x chapter
geom_point(color = "purple", size = .5, alpha = .1,
position = position_jitter(height = 0, width = .3)) +
labs(x = "Chapter", y = "Mean % of chapter pages fully completed",
title = "Average # of pages completed within chapter")
# Average number of attempts per question, by chapter
plot.attempts <-
ggplot(df.summary,
aes(x = factor(chapter), y = no_attempts)) +
# bootstrapped 95% CI around the chapter mean
stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
color = "black", linewidth = 1, width = 0.1) +
# diamond marker at the chapter mean
stat_summary(fun = mean, geom = "point",
color = "black", shape = 18, size = 3) +
# jittered dots: one per student x chapter
geom_point(color = "purple", size = .5, alpha = .1,
position = position_jitter(height = 0, width = .3)) +
labs(x = "Chapter", y = "Mean # of attempts across questions",
title = "Average # of attempts across questions within chapter")
# Average time spent per question (minutes), by chapter
plot.time <-
ggplot(df.summary,
aes(x = factor(chapter), y = time_spent)) +
# bootstrapped 95% CI around the chapter mean
stat_summary(fun.data = mean_cl_boot, geom = "errorbar",
color = "black", linewidth = 1, width = 0.1) +
# diamond marker at the chapter mean
stat_summary(fun = mean, geom = "point",
color = "black", shape = 18, size = 3) +
# jittered dots: one per student x chapter
geom_point(color = "purple", size = .5, alpha = .1,
position = position_jitter(height = 0, width = .3)) +
labs(x = "Chapter", y = "Mean time spent across questions (mins)",
title = "Mean time spent across questions within chapter")
# combine the three engagement panels into one figure (patchwork),
# collecting shared legends into a single one
plot.engagement =
plot.pages + plot.attempts + plot.time +
plot_layout(guides = "collect")
plot.engagementNext, we examine what predicts average student performance across chapters.
# Build modeling data: item-level outcomes joined with each student's
# previous-chapter score (one row per student x item)
df.compare =
df.reduced %>%
left_join(df.summary %>%
select(student_id, chapter, score_prev_chapter),
by = c("student_id", "chapter")) %>%
# keep only the outcome and the predictors used by the models below
select(points_earned, chapter, book,
attempt, score_prev_chapter,
class_id, student_id) %>%
# grouping variables as factors for glmer random effects
mutate(across(c(class_id, book, student_id),
factor)) %>%
# create binary value marking missing values for prev chapter scores
mutate(prev_missing = if_else(is.na(score_prev_chapter), 1, 0),
# fill first chapter values with 0
score_prev_chapter = replace_na(score_prev_chapter, 0)) %>%
# NOTE(review): drop_na() also removes rows with NA points_earned -- confirm intended
drop_na()## # A tibble: 1 × 1
## r
## <dbl>
## 1 -0.456
# Model 1 (baseline): logistic mixed-effects model of item correctness.
# Outcome: points_earned (0/1); fixed effects: chapter (numeric) + book;
# random intercepts for class and student (students nested in classes)
lm.model1 =
glmer(points_earned ~
1 +
# fixed effect for chapter and book
chapter +
book +
# random intercept for class and student
(1 | class_id) +
(1 | student_id),
family = binomial(link = "logit"),
data = df.compare)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## points_earned ~ 1 + chapter + book + (1 | class_id) + (1 | student_id)
## Data: df.compare
##
## AIC BIC logLik deviance df.resid
## 179294.8 179345.6 -89642.4 179284.8 193307
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.3489 0.2548 0.3856 0.5166 2.2969
##
## Random effects:
## Groups Name Variance Std.Dev.
## student_id (Intercept) 0.436610 0.66076
## class_id (Intercept) 0.005672 0.07531
## Number of obs: 193312, groups: student_id, 560; class_id, 5
##
## Fixed effects:
## Estimate Std. Error
## (Intercept) 2.326583 0.062186
## chapter -0.168990 0.002396
## bookCollege / Statistics and Data Science (ABC) 0.094735 0.092010
## z value Pr(>|z|)
## (Intercept) 37.41 <2e-16 ***
## chapter -70.55 <2e-16 ***
## bookCollege / Statistics and Data Science (ABC) 1.03 0.303
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) chaptr
## chapter -0.206
## bC/SaDS(ABC -0.637 -0.009
## model term df1 df2 F.ratio Chisq p.value
## chapter 1 Inf 4976.555 4976.555 <.0001
## book 1 Inf 1.060 1.060 0.3032
## $chapter
## # Predicted probabilities of points_earned
##
## chapter | Predicted | 95% CI
## --------------------------------
## 1 | 0.78 | 0.60, 0.64
## 2 | 0.75 | 0.58, 0.61
## 3 | 0.72 | 0.56, 0.59
## 4 | 0.69 | 0.54, 0.57
## 5 | 0.66 | 0.53, 0.55
## 6 | 0.63 | 0.52, 0.54
## 7 | 0.60 | 0.51, 0.53
## 9 | 0.56 | 0.50, 0.52
##
## Adjusted for:
## * book = College / Advanced Statistics and Data Science (ABCD)
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $book
## # Predicted probabilities of points_earned
##
## book | Predicted | 95% CI
## ------------------------------------------------------------------------------
## College / Advanced Statistics and Data Science (ABCD) | 0.66 | 0.53, 0.55
## College / Statistics and Data Science (ABC) | 0.68 | 0.54, 0.56
##
## Adjusted for:
## * chapter = 5.00
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "."
Including only number of attempts as an engagement predictor because the model does not converge using the other predictors.
# quick check of correlations among score and candidate engagement predictors
# (informs which engagement variable goes into the model below)
df.summary %>%
select(score, chapter,
time_spent, no_attempts, chapter_pages_completed) %>%
cor(use = "pairwise.complete.obs")## score chapter time_spent
## score 1.00000000 -0.4557170 0.086554242
## chapter -0.45571704 1.0000000 -0.041677097
## time_spent 0.08655424 -0.0416771 1.000000000
## no_attempts -0.02824059 -0.1815077 0.034590356
## chapter_pages_completed -0.18337909 0.3655008 -0.005932573
## no_attempts chapter_pages_completed
## score -0.02824059 -0.183379092
## chapter -0.18150775 0.365500846
## time_spent 0.03459036 -0.005932573
## no_attempts 1.00000000 -0.111370362
## chapter_pages_completed -0.11137036 1.000000000
# Model 2: Model 1 plus number of attempts as an engagement predictor
lm.model2 =
glmer(points_earned ~
1 +
# fixed effect for chapter, book
chapter +
book +
# fixed effects for student engagement variable (only attempt)
attempt +
# random intercept for class and student
(1 | class_id) +
(1 | student_id),
family = binomial(link = "logit"),
data = df.compare)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## points_earned ~ 1 + chapter + book + attempt + (1 | class_id) +
## (1 | student_id)
## Data: df.compare
##
## AIC BIC logLik deviance df.resid
## 179250.0 179311.1 -89619.0 179238.0 193306
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.3726 0.2546 0.3854 0.5167 2.2897
##
## Random effects:
## Groups Name Variance Std.Dev.
## student_id (Intercept) 0.434013 0.65880
## class_id (Intercept) 0.005763 0.07592
## Number of obs: 193312, groups: student_id, 560; class_id, 5
##
## Fixed effects:
## Estimate Std. Error
## (Intercept) 2.383142 0.063186
## chapter -0.170192 0.002404
## bookCollege / Statistics and Data Science (ABC) 0.093061 0.092968
## attempt -0.040946 0.005894
## z value Pr(>|z|)
## (Intercept) 37.716 < 2e-16 ***
## chapter -70.810 < 2e-16 ***
## bookCollege / Statistics and Data Science (ABC) 1.001 0.317
## attempt -6.947 3.73e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) chaptr b/SaDS(
## chapter -0.214
## bC/SaDS(ABC -0.635 -0.008
## attempt -0.131 0.077 0.003
## model term df1 df2 F.ratio Chisq p.value
## chapter 1 Inf 5014.042 5014.042 <.0001
## book 1 Inf 1.002 1.002 0.3168
## attempt 1 Inf 48.264 48.264 <.0001
## $chapter
## # Predicted probabilities of points_earned
##
## chapter | Predicted | 95% CI
## --------------------------------
## 1 | 0.78 | 0.60, 0.64
## 2 | 0.75 | 0.58, 0.61
## 3 | 0.72 | 0.56, 0.59
## 4 | 0.69 | 0.54, 0.57
## 5 | 0.66 | 0.53, 0.55
## 6 | 0.63 | 0.52, 0.54
## 7 | 0.60 | 0.51, 0.53
## 9 | 0.56 | 0.51, 0.52
##
## Adjusted for:
## * book = College / Advanced Statistics and Data Science (ABCD)
## * attempt = 1.00
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $book
## # Predicted probabilities of points_earned
##
## book | Predicted | 95% CI
## ------------------------------------------------------------------------------
## College / Advanced Statistics and Data Science (ABCD) | 0.66 | 0.53, 0.55
## College / Statistics and Data Science (ABC) | 0.68 | 0.54, 0.57
##
## Adjusted for:
## * chapter = 5.00
## * attempt = 1.00
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $attempt
## # Predicted probabilities of points_earned
##
## attempt | Predicted | 95% CI
## --------------------------------
## 0 | 0.67 | 0.53, 0.56
## 10 | 0.60 | 0.51, 0.53
## 15 | 0.57 | 0.50, 0.53
## 25 | 0.53 | 0.49, 0.52
## 35 | 0.50 | 0.48, 0.52
## 45 | 0.48 | 0.47, 0.52
## 55 | 0.45 | 0.44, 0.52
## 70 | 0.37 | 0.36, 0.51
##
## Adjusted for:
## * chapter = 5.00
## * book = College / Advanced Statistics and Data Science (ABCD)
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "."
# Plot model-2 predicted probabilities across chapter x attempts x book
# NOTE(review): this chain mixes %>% and |> pipes -- harmless but inconsistent
plot.model2 =
lm.model2 %>%
ggpredict(terms = c("chapter [1:9]",
"attempt [1, 2, 5]",
"book"),
ci_level = NA, # suppress confidence bands
bias_correction = TRUE) |>
plot() +
ylim(0, 1) +
scale_x_continuous(limits = c(1, 9), breaks = 1:9) +
labs(
y = "Predicted probability of earning point",
x = "Chapter",
colour = "Attempts",
title = "Predicted probabilities from logistic mixed effects regression",
subtitle = "point_earned (1 or 0) ~ chapter + book + attempt"
) +
theme(legend.position = "bottom")
plot.model2# quick check of correlation
# correlation between chapter score and previous-chapter score
df.summary %>%
select(score, score_prev_chapter) %>%
cor(use = "pairwise.complete.obs")## score score_prev_chapter
## score 1.0000000 0.7184284
## score_prev_chapter 0.7184284 1.0000000
# Model 3: Model 1 plus previous-chapter score as predictor
lm.model3 =
glmer(points_earned ~
1 +
# fixed effect for chapter, book
chapter +
book +
# fixed effect for prev chapter score
score_prev_chapter +
# random intercept for class and student
(1 | class_id) +
(1 | student_id),
family = binomial(link = "logit"),
data = df.compare)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## points_earned ~ 1 + chapter + book + score_prev_chapter + (1 |
## class_id) + (1 | student_id)
## Data: df.compare
##
## AIC BIC logLik deviance df.resid
## 178446.6 178507.6 -89217.3 178434.6 193306
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -8.0363 0.2201 0.3860 0.5249 2.2112
##
## Random effects:
## Groups Name Variance Std.Dev.
## student_id (Intercept) 0.581753 0.76273
## class_id (Intercept) 0.009534 0.09764
## Number of obs: 193312, groups: student_id, 560; class_id, 5
##
## Fixed effects:
## Estimate Std. Error
## (Intercept) 3.135537 0.081804
## chapter -0.177235 0.002511
## bookCollege / Statistics and Data Science (ABC) 0.132217 0.113242
## score_prev_chapter -1.009659 0.037101
## z value Pr(>|z|)
## (Intercept) 38.330 <2e-16 ***
## chapter -70.593 <2e-16 ***
## bookCollege / Statistics and Data Science (ABC) 1.168 0.243
## score_prev_chapter -27.214 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) chaptr b/SaDS(
## chapter -0.240
## bC/SaDS(ABC -0.586 -0.010
## scr_prv_chp -0.381 0.215 -0.013
## model term df1 df2 F.ratio Chisq p.value
## chapter 1 Inf 4983.325 4983.325 <.0001
## book 1 Inf 1.363 1.363 0.2430
## score_prev_chapter 1 Inf 740.586 740.586 <.0001
## $chapter
## # Predicted probabilities of points_earned
##
## chapter | Predicted | 95% CI
## --------------------------------
## 1 | 0.78 | 0.60, 0.65
## 2 | 0.75 | 0.58, 0.62
## 3 | 0.72 | 0.56, 0.59
## 4 | 0.69 | 0.54, 0.57
## 5 | 0.66 | 0.53, 0.56
## 6 | 0.63 | 0.52, 0.54
## 7 | 0.60 | 0.51, 0.53
## 9 | 0.55 | 0.50, 0.52
##
## Adjusted for:
## * book = College / Advanced Statistics and Data Science (ABCD)
## * score_prev_chapter = 0.76
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $book
## # Predicted probabilities of points_earned
##
## book | Predicted | 95% CI
## ------------------------------------------------------------------------------
## College / Advanced Statistics and Data Science (ABCD) | 0.66 | 0.53, 0.56
## College / Statistics and Data Science (ABC) | 0.68 | 0.54, 0.57
##
## Adjusted for:
## * chapter = 5.00
## * score_prev_chapter = 0.76
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $score_prev_chapter
## # Predicted probabilities of points_earned
##
## score_prev_chapter | Predicted | 95% CI
## -------------------------------------------
## 0.00 | 0.79 | 0.61, 0.66
## 0.20 | 0.76 | 0.58, 0.63
## 0.40 | 0.72 | 0.56, 0.60
## 0.60 | 0.69 | 0.54, 0.57
## 0.80 | 0.65 | 0.53, 0.55
## 1.00 | 0.62 | 0.52, 0.54
##
## Adjusted for:
## * chapter = 5.00
## * book = College / Advanced Statistics and Data Science (ABCD)
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "."
# Model 4 (full): combines predictors from all research questions,
# plus an indicator for imputed previous-chapter scores
lm.model4 =
glmer(points_earned ~
1 +
# RQ1: fixed effect for chapter, book
chapter +
book +
# RQ2: fixed effect for student engagement (attempts)
attempt +
# RQ3: fixed effect for prev chapter score
score_prev_chapter +
# indicator: 1 when prev score was missing (chapter 1) and filled with 0
prev_missing +
# random intercept for class and student
(1 | class_id) +
(1 | student_id),
family = binomial(link = "logit"),
data = df.compare)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## points_earned ~ 1 + chapter + book + attempt + score_prev_chapter +
## prev_missing + (1 | class_id) + (1 | student_id)
## Data: df.compare
##
## AIC BIC logLik deviance df.resid
## 177142.2 177223.6 -88563.1 177126.2 193304
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -10.8232 0.1614 0.3908 0.5187 2.3133
##
## Random effects:
## Groups Name Variance Std.Dev.
## student_id (Intercept) 0.328508 0.57316
## class_id (Intercept) 0.006215 0.07884
## Number of obs: 193312, groups: student_id, 560; class_id, 5
##
## Fixed effects:
## Estimate Std. Error
## (Intercept) 1.176616 0.088683
## chapter -0.103020 0.003307
## bookCollege / Statistics and Data Science (ABC) 0.111500 0.089329
## attempt -0.042295 0.005959
## score_prev_chapter 0.930898 0.067236
## prev_missing 2.762579 0.082018
## z value Pr(>|z|)
## (Intercept) 13.268 < 2e-16 ***
## chapter -31.148 < 2e-16 ***
## bookCollege / Statistics and Data Science (ABC) 1.248 0.212
## attempt -7.098 1.27e-12 ***
## score_prev_chapter 13.845 < 2e-16 ***
## prev_missing 33.683 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) chaptr b/SaDS( attmpt scr_p_
## chapter -0.586
## bC/SaDS(ABC -0.411 -0.019
## attempt -0.071 0.039 0.003
## scr_prv_chp -0.727 0.628 -0.022 -0.031
## prev_missng -0.556 0.562 -0.010 -0.023 0.720
## model term df1 df2 F.ratio Chisq p.value
## chapter 1 Inf 970.208 970.208 <.0001
## book 1 Inf 1.558 1.558 0.2120
## attempt 1 Inf 50.375 50.375 <.0001
## score_prev_chapter 1 Inf 191.692 191.692 <.0001
## prev_missing 1 Inf 1134.516 1134.516 <.0001
## $chapter
## # Predicted probabilities of points_earned
##
## chapter | Predicted | 95% CI
## --------------------------------
## 1 | 0.74 | 0.57, 0.60
## 2 | 0.72 | 0.56, 0.59
## 3 | 0.70 | 0.55, 0.58
## 4 | 0.68 | 0.54, 0.57
## 5 | 0.67 | 0.53, 0.56
## 6 | 0.65 | 0.53, 0.55
## 7 | 0.63 | 0.52, 0.54
## 9 | 0.60 | 0.51, 0.53
##
## Adjusted for:
## * book = College / Advanced Statistics and Data Science (ABCD)
## * attempt = 1.00
## * score_prev_chapter = 0.76
## * prev_missing = 0.07
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $book
## # Predicted probabilities of points_earned
##
## book | Predicted | 95% CI
## ------------------------------------------------------------------------------
## College / Advanced Statistics and Data Science (ABCD) | 0.67 | 0.53, 0.56
## College / Statistics and Data Science (ABC) | 0.69 | 0.54, 0.57
##
## Adjusted for:
## * chapter = 5.00
## * attempt = 1.00
## * score_prev_chapter = 0.76
## * prev_missing = 0.07
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $attempt
## # Predicted probabilities of points_earned
##
## attempt | Predicted | 95% CI
## --------------------------------
## 0 | 0.67 | 0.54, 0.56
## 10 | 0.60 | 0.51, 0.53
## 15 | 0.57 | 0.50, 0.53
## 25 | 0.53 | 0.49, 0.52
## 35 | 0.50 | 0.48, 0.52
## 45 | 0.48 | 0.46, 0.52
## 55 | 0.45 | 0.44, 0.52
## 70 | 0.36 | 0.36, 0.51
##
## Adjusted for:
## * chapter = 5.00
## * book = College / Advanced Statistics and Data Science (ABCD)
## * score_prev_chapter = 0.76
## * prev_missing = 0.07
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $score_prev_chapter
## # Predicted probabilities of points_earned
##
## score_prev_chapter | Predicted | 95% CI
## -------------------------------------------
## 0.00 | 0.56 | 0.50, 0.52
## 0.20 | 0.58 | 0.51, 0.52
## 0.40 | 0.61 | 0.52, 0.53
## 0.60 | 0.64 | 0.52, 0.54
## 0.80 | 0.67 | 0.54, 0.56
## 1.00 | 0.70 | 0.55, 0.58
##
## Adjusted for:
## * chapter = 5.00
## * book = College / Advanced Statistics and Data Science (ABCD)
## * attempt = 1.00
## * prev_missing = 0.07
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## $prev_missing
## # Predicted probabilities of points_earned
##
## prev_missing | Predicted | 95% CI
## -------------------------------------
## 0 | 0.63 | 0.52, 0.54
## 1 | 0.96 | 0.88, 0.91
##
## Adjusted for:
## * chapter = 5.00
## * book = College / Advanced Statistics and Data Science (ABCD)
## * attempt = 1.00
## * score_prev_chapter = 0.76
## * class_id = 0 (population-level)
## * student_id = 0 (population-level)
##
## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "."
# collect candidate models for comparison
models <- list(`Model 1` = lm.model1,
`Model 2` = lm.model2,
`Model 3` = lm.model3,
`Model 4 (Full)` = lm.model4)
# compute AIC and BIC weights (higher weight = better relative support)
aic_bic <-
performance::compare_performance(
models,
metrics = c("AIC", "BIC"),
rank = TRUE
) %>%
select(Model = Name, AIC = AIC_wt, BIC = BIC_wt)
# RMSE of predicted probabilities against the observed 0/1 outcomes.
# @param model fitted (g)lmer model
# @param data  data frame with `points_earned` and the model's predictors
get_rmse <-
function(model, data) {
pred <- predict(model, newdata = data, type = "response",
na.action = na.exclude)
# drop rows where either prediction or outcome is missing
ok <- !is.na(pred) & !is.na(data$points_earned)
sqrt(mean((data$points_earned[ok] - pred[ok])^2))
}
rmse_tbl <-
tibble(Model = names(models),
RMSE = map_dbl(models, get_rmse, data = df.compare)
)
# merge metrics into one table
df.metrics <-
left_join(aic_bic, rmse_tbl, by = "Model") %>%
# NOTE(review): AIC column holds weights (higher = better), so ascending
# sort puts the best-supported model last -- confirm intended ordering
arrange(AIC) %>%
# lambda form: passing extra args through across(.fns, ...) is deprecated
mutate(across(where(is.numeric), \(x) round(x, digits = 5)))
View(df.metrics)We have a held-out 20% of data that contains entirely new items. We now have the models above make predictions for that held-out dataset (predicting proportion correct for these new items).
# Import held out data (20% split with entirely new items)
df.heldout <-
read.csv(paste0(data_dir, "filtered20_eoc_2023.csv"))
# (a no-op group_by(...) %>% ungroup() pair was removed here)
df.heldout <-
df.heldout %>%
# keep only relevant cols
select(c(book, release,
institution_id, class_id,
student_id,
chapter,
score)) %>%
# match the factor coding used when fitting the models
mutate(across(c(class_id, student_id), factor))# predict held out students - can only use model 1 since we don't have other data
df.predict <-
df.heldout %>%
# population-level predictions: re.form = NA drops random effects, so
# unseen students/classes are fine (allow.new.levels = TRUE)
mutate(pred = predict(lm.model1, newdata = ., type = "response",
re.form = NA, # fixed-effects only
allow.new.levels = TRUE))
# aggregate to one row per student x chapter
df.submit <-
df.predict %>%
group_by(id = student_id, chapter) %>%
summarise(score = mean(pred), .groups = "drop") %>%
select(score)
print(df.submit)## # A tibble: 1,183 × 1
## score
## <dbl>
## 1 0.905
## 2 0.889
## 3 0.872
## 4 0.851
## 5 0.829
## 6 0.803
## 7 0.775
## 8 0.744
## 9 0.711
## 10 0.905
## # ℹ 1,173 more rows